In this section we’ll continue using the CRC dataset, and also load the NCI60 dataset as a larger example.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Read the CRC training data from the local data directory.
CRC <- read.csv("./data/CRC_train.csv")
# Read the NCI60 data; it serves as the larger dataset in the overplotting
# examples further down.
NCI60 <- read.csv("./data/NCI60.csv")
# Show the column names of NCI60.
names(NCI60)
## [1] "BR_BT549_a" "BR_BT549_b" "BR_HS578T_a" "BR_HS578T_b"
## [5] "BR_MCF7_a" "BR_MCF7_b" "BR_MDAMB231_a" "BR_MDAMB231_b"
## [9] "BR_MDAMB468_a" "BR_MDAMB468_b" "BR_T47D_a" "BR_T47D_b"
## [13] "CNS_SF268_a" "CNS_SF268_b" "CNS_SF295_a" "CNS_SF295_b"
## [17] "CNS_SF539_a" "CNS_SF539_b" "CNS_SNB19_a" "CNS_SNB19_b"
## [21] "CNS_SNB75_a" "CNS_SNB75_b" "CNS_U251_a" "CNS_U251_b"
## [25] "CO_COLO205_a" "CO_COLO205_b" "CO_HCC2998_a" "CO_HCC2998_b"
## [29] "CO_HCT116_a" "CO_HCT116_b" "CO_HCT15_a" "CO_HCT15_b"
## [33] "CO_HT29_a" "CO_HT29_b" "CO_KM12_a" "CO_KM12_b"
## [37] "CO_SW620_a" "CO_SW620_b" "LC_A549_a" "LC_A549_b"
## [41] "LC_EKVX_a" "LC_EKVX_b" "LC_HOP62_a" "LC_HOP62_b"
## [45] "LC_HOP92_a" "LC_HOP92_b" "LC_NCIH226_a" "LC_NCIH226_b"
## [49] "LC_NCIH23_a" "LC_NCIH23_b" "LC_NCIH322M_a" "LC_NCIH322M_b"
## [53] "LC_NCIH460_a" "LC_NCIH460_b" "LC_NCIH522_a" "LC_NCIH522_b"
## [57] "LE_CCRFCEM_a" "LE_CCRFCEM_b" "LE_HL60_a" "LE_HL60_b"
## [61] "LE_K562_a" "LE_K562_b" "LE_MOLT4_a" "LE_MOLT4_b"
## [65] "LE_RPMI8226_a" "LE_RPMI8226_b" "LE_SR_a" "LE_SR_b"
## [69] "ME_LOXIMVI_a" "ME_LOXIMVI_b" "ME_M14_a" "ME_M14_b"
## [73] "ME_MALME3M_a" "ME_MALME3M_b" "ME_MDAMB435_a" "ME_MDAMB435_b"
## [77] "ME_SKMEL2_a" "ME_SKMEL2_b" "ME_SKMEL28_a" "ME_SKMEL28_b"
## [81] "ME_SKMEL5_a" "ME_SKMEL5_b" "ME_UACC257_a" "ME_UACC257_b"
## [85] "ME_UACC62_a" "ME_UACC62_b" "OV_IGROV1_a" "OV_IGROV1_b"
## [89] "OV_NCIADRRES_a" "OV_NCIADRRES_b" "OV_OVCAR3_a" "OV_OVCAR3_b"
## [93] "OV_OVCAR4_a" "OV_OVCAR4_b" "OV_OVCAR5_a" "OV_OVCAR5_b"
## [97] "OV_OVCAR8_a" "OV_OVCAR8_b" "OV_SKOV3_a" "OV_SKOV3_b"
## [101] "PR_DU145_a" "PR_DU145_b" "PR_PC3_a" "PR_PC3_b"
## [105] "RE_7860_a" "RE_7860_b" "RE_A498_a" "RE_A498_b"
## [109] "RE_ACHN_a" "RE_ACHN_b" "RE_CAKI1_a" "RE_CAKI1_b"
## [113] "RE_RXF393_a" "RE_RXF393_b" "RE_SN12C_a" "RE_SN12C_b"
## [117] "RE_TK10_a" "RE_TK10_b" "RE_UO31_a" "RE_UO31_b"
## [121] "Protein"
Recall the basic scatter plot we made before
# Base plot object mapping SERPINA3 to the x-axis and TIMP1 to the y-axis;
# later examples add different layers to `p`.
p <- ggplot(data = CRC, mapping = aes(x = SERPINA3, y = TIMP1))
# Basic scatter plot: one point per row of CRC
p + geom_point()
geom_jitter() adds a small amount of random variation to the location of each point, and is a useful way of handling overplotting caused by discreteness in smaller datasets
# geom_jitter() moves each point by a small, random amount;
# `width` sets the amount of horizontal jitter.
p + geom_jitter(width = 0.25)
But geom_jitter() doesn’t work well for larger datasets
# Base plot object for the NCI60 examples: two of its columns against each other
s <- ggplot(data = NCI60, mapping = aes(x = BR_BT549_a, y = BR_HS578T_a))
# Plain points
s + geom_point()
# Jittered points (default jitter amounts)
s + geom_jitter()
We can change the shape from solid to hollow circles
# shape = 1 draws each point as a hollow circle
s + geom_point(shape = 1)
Or shrink each point to the size of a single pixel
# shape = "." draws each point at pixel size
s + geom_point(shape = ".")
We can also use alpha blending (transparency) to make the points transparent. If we specify alpha as a ratio, the denominator gives the number of points that must be overplotted to give a solid colour.
# Increasing transparency: with alpha = 1 / k, k overplotted points are
# needed to produce a solid colour.
s + geom_point(alpha = 1 / 3)
s + geom_point(alpha = 1 / 5)
s + geom_point(alpha = 1 / 10)
To order the bars by height, we first have to calculate the height of each bar manually and sort by it, and then make the bar plot with stat = "identity". For comparison, the default geom_bar() plot is drawn first.
# Default bar chart: geom_bar() counts the rows in each Sub_group itself
g <- ggplot(data = CRC, mapping = aes(x = Sub_group))
g + geom_bar()
# Count the number of samples in each sub group by hand
subgroup <- CRC %>%
  group_by(Sub_group) %>%
  summarise(n = n())
subgroup
## # A tibble: 3 × 2
## Sub_group n
## <fctr> <int>
## 1 Benign 34
## 2 CRC 100
## 3 Healthy 66
# Sort the rows by ascending count, then rebuild Sub_group as a factor whose
# levels follow that row order so the plot retains the sorted order.
subgroup <- subgroup %>%
  arrange(n) %>%
  mutate(Sub_group = factor(Sub_group, levels = Sub_group))
subgroup
## # A tibble: 3 × 2
## Sub_group n
## <fctr> <int>
## 1 Benign 34
## 2 Healthy 66
## 3 CRC 100
# Bar heights come straight from the precomputed `n` column
# (stat = "identity"), so no counting is done by the geom.
ggplot(data = subgroup, mapping = aes(x = Sub_group, y = n)) +
  geom_bar(stat = "identity")
We can change the bin width of a histogram and restrict the axis limits to zoom in on the details of the plot.
# Base plot object for the histogram of SERPINA3
h <- ggplot(data = CRC, mapping = aes(x = SERPINA3))
# Default binning
h + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Change the bar width by setting the bin width explicitly
h + geom_histogram(binwidth = 0.1)
# Change the limits of the x-axis; coord_cartesian() also accepts `ylim`
h +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(12.5, 16))
We can also colour a specific point in the scatter plot and label it, so that it stands out from the rest
# Plain scatter plot, for comparison with the highlighted version below
ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1))

# Work on a copy so the helper column `highlight` is not added to CRC
CRC2 <- CRC
# Value of the Sample column to highlight
highlight.sample <- "P1D2"
# %in% never returns NA, so rows with a missing Sample are classed as
# "normal" instead of propagating NA into the colour mapping.
CRC2$highlight <- ifelse(
  CRC2$Sample %in% highlight.sample,
  "highlight",
  "normal"
)
# Rows to annotate; derived from the flag so both always agree
textdf <- CRC2[CRC2$highlight == "highlight", ]
# Named vector: keys must match the values stored in CRC2$highlight
mycolours <- c("highlight" = "red", "normal" = "grey50")
ggplot(data = CRC2, aes(x = SERPINA3, y = TIMP1)) +
  geom_point(size = 3, aes(colour = highlight)) +
  scale_color_manual("Sample", values = mycolours) +
  # The label is taken from the Sample column of `textdf` rather than from a
  # global constant, so the layer stays consistent with its own data.
  # y is scaled by 0.99 to offset the text from the point.
  geom_text(
    data = textdf,
    aes(x = SERPINA3, y = TIMP1 * 0.99, label = Sample),
    colour = "red"
  )
# Scatter plot coloured by sub group; `p1` is the base for the label, theme
# and scale examples.
p1 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1, color = Sub_group))
# Title, subtitle and caption
p1 + labs(
  title = "Compare between sub groups",
  subtitle = "Benign samples are mixed with the other two groups",
  caption = "Data vis example"
)
# Axis labels and legend title
p1 + labs(x = "Protein SERPINA3", y = "Protein TIMP1", color = "Sub groups")
We can change the plot background by theme. Reference: http://ggplot2.tidyverse.org/reference/theme.html
# Apply several of ggplot2's built-in complete themes to the same plot
p1 + theme_grey()
p1 + theme_classic()
p1 + theme_dark()
p1 + theme_light()
p1 + theme_void()
# Or override a single theme element: a white panel background with a
# grey50 outline
p1 + theme(panel.background = element_rect(fill = "white", colour = "grey50"))
We can further change the appearance and the orientation angle of title and axis labels
# Keep the two protein columns plus the Sample column
CRC.two.prot <- CRC[, c("SERPINA3", "TIMP1", "Sample")]
# Reshape the first 20 rows to long format: one row per Sample/Protein pair.
# head() is used instead of [1:20, ] so that a data set with fewer than
# 20 rows is not padded with all-NA rows.
# NOTE: gather() is superseded by pivot_longer() in tidyr >= 1.0.0; it is
# kept here for compatibility with older tidyr versions.
plot.data <- head(CRC.two.prot, 20) %>%
  gather(Protein, Abundance, -Sample)
# One line per protein across the samples
p2 <- ggplot(plot.data) +
  geom_line(aes(x = Sample, y = Abundance, group = Protein, colour = Protein))
# Style the title, the tick labels (rotated 45 degrees on x) and axis titles
p2 + theme(
  plot.title = element_text(size = 20, colour = "darkblue"),
  axis.text.x = element_text(face = "bold", color = "blue", size = 10, angle = 45),
  axis.title.x = element_text(face = "bold", colour = "#990000", size = 20),
  axis.text.y = element_text(face = "bold", color = "blue", size = 14),
  axis.title.y = element_text(face = "bold", colour = "#990000", size = 20)
)
# Hide x and y axis tick mark labels and axis titles
p2 + theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank(),
  axis.text.y = element_blank(),
  axis.title.y = element_blank()
)
# Remove the axis tick marks only (tick mark labels are kept)
p2 + theme(axis.ticks = element_blank())
Adjust Legend position
# legend.position controls where the legend is drawn
p2 + theme(legend.position = "right") # the default
p2 + theme(legend.position = "bottom") # below the plot
p2 + theme(legend.position = "none") # no legend at all
Facetting creates tables of graphics by splitting the data into subsets and displaying the same graph for each subset
# One panel per value of Group, laid out as columns
p3 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1)) +
  facet_grid(. ~ Group)
p3
# Restyle the facet label strips: yellow fill with a white outline
p3 + theme(
  strip.background = element_rect(colour = "white", fill = "yellow")
)
A scale function exists for each aesthetic. We can change color themes
# Replace the default colour scale of the Sub_group mapping
p1 + scale_color_grey() # shades of grey
p1 + scale_color_brewer(palette = "Set1") # ColorBrewer "Set1" palette
We can also change the axis scales
# Transform the position scales
p1 + scale_x_log10() # log10 x-axis
p1 + scale_y_log10() # log10 y-axis
p1 + scale_y_reverse() # reversed y-axis
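Change the limits of the x-axis. Limits set through a scale drop the observations that fall outside the range (hence the warning below), whereas coord_cartesian() only zooms in on the plot.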
# Name both axes through their scales and restrict the x scale to [12, 16]
p1 +
  scale_x_continuous(name = "Protein SERPINA3", limits = c(12, 16)) +
  scale_y_continuous(name = "Protein TIMP1")
## Warning: Removed 15 rows containing missing values (geom_point).